# -*- coding: utf-8 -*-
import scrapy
from scrapy import log
from bs4 import BeautifulSoup
from datetime import date,timedelta
from spider_news_all.items import SpiderNewsAllItem
import threading
import MySQLdb
import traceback


class SznewsSpider(scrapy.Spider):
    """Spider for sznews.com (Shenzhen News) list pages.

    Walks four section index pages, follows each article link to
    ``parse_item``, and paginates via the "next page" link.  When
    ``FLAG_INTERRUPT`` is True, crawling stops as soon as an article that is
    already stored in the ``news_all`` table is encountered (incremental
    crawl mode).
    """
    name = "sznews"
    allowed_domains = ["www.sznews.com"]
    start_urls = ["http://www.sznews.com/news/node_141128.htm",
                  "http://www.sznews.com/news/node_150507.htm",
                  "http://www.sznews.com/news/node_237320.htm",
                  "http://www.sznews.com/news/node_109926.htm"]

    # Let 521 (anti-bot JS challenge) responses reach the spider instead of
    # being discarded by the default HTTP error middleware.
    handle_httpstatus_list = [521]

    # When True, stop paginating once an already-saved article is seen.
    FLAG_INTERRUPT = False
    # Parameterized query: the DB driver fills in %s placeholders and escapes
    # the values.  (The old "...title='%s'..." % (title, url) form was both
    # SQL-injectable and broken for any title containing a quote.)
    SELECT_NEWS_BY_TITLE_AND_URL = "SELECT * FROM news_all WHERE title=%s AND url=%s"

    # Guards the shared cursor across Scrapy's callback threads.
    lock = threading.RLock()
    # NOTE(review): credentials are hardcoded and the connection is opened at
    # import time; consider moving them to Scrapy settings and connecting in
    # __init__ / from_crawler instead.
    conn = MySQLdb.connect(host='119.29.197.217', port=3306, user='tytt', passwd='Tkys#_@12$#ttR', db='news',
                           autocommit=True)
    conn.set_character_set('utf8')
    cursor = conn.cursor()
    cursor.execute('SET NAMES utf8;')
    cursor.execute('SET CHARACTER SET utf8;')
    cursor.execute('SET character_set_connection=utf8;')

    def is_news_not_saved(self, title, url):
        """Return True if the article should be crawled.

        In incremental mode (FLAG_INTERRUPT) an article already present in
        the DB returns False, which signals ``parse`` to stop paginating.
        Otherwise always returns True.
        """
        if not self.FLAG_INTERRUPT:
            return True
        # Bug fix: the original called lock.release() after a return
        # statement, so the lock was acquired but never released.  The
        # context manager guarantees release on every path.
        with self.lock:
            rows = self.cursor.execute(self.SELECT_NEWS_BY_TITLE_AND_URL, (title, url))
        if rows > 0:
            log.msg("News saved all finished.", level=log.INFO)
            return False
        return True

    def parse(self, response):
        """Parse a section index page: queue article requests and, unless an
        already-saved article was found, the next index page."""
        log.msg("Start to parse page " + response.url, level=log.INFO)
        items = []
        links = response.xpath('//div[@class="list-pt"]/ul/li[@class="list-pt-li cf"]/a[1]/@href').extract()
        # Hoisted out of the loop: the original re-ran this extraction on
        # every iteration only to index a single element from it.
        titles = response.xpath('//div[@class="list-pt"]/ul/li[@class="list-pt-li cf"]/a[not(@class)]/h3').xpath('string(.)').extract()
        need_parse_next_page = True
        for i in range(0, len(links)):
            # Skip live-broadcast and mobile (wap) links.
            if links[i].find('liveHtml') >= 0 or links[i].find('wap') >= 0:
                continue
            title = titles[i]
            print(links[i], title)
            need_parse_next_page = self.is_news_not_saved(title, links[i])
            if not need_parse_next_page:
                # Incremental mode hit an already-saved article: stop here.
                break
            # Title is carried in meta because the article page's own <h1>
            # markup is inconsistent across sections.
            items.append(self.make_requests_from_url(links[i]).replace(callback=self.parse_item, meta={'title': title}))
        page_next = response.xpath('//li[@class="page-next"]/a/@href').extract_first()
        if page_next and need_parse_next_page:
            items.append(self.make_requests_from_url(page_next))
        return items

    def parse_item(self, response):
        """Parse one article page into a SpiderNewsAllItem.

        Tries several alternative XPaths for the date and body because the
        site uses different templates per section.
        """
        log.msg("Start to parse news " + response.url, level=log.INFO)
        item = SpiderNewsAllItem()
        day = title = _type = keywords = url = article = ''
        url = response.url
        # Title comes from the index page (set in parse) — the article pages
        # themselves use inconsistent heading markup.
        title = response.meta['title']
        day = response.xpath('//div[@class="bigPhoto-date yahei fs18 r"]').xpath('string(.)').extract_first()
        if day is None:
            day = response.xpath('//div[@class="fs18 share-date l"]/text()').extract_first()
        _type = "深圳新闻网"
        keywords = response.xpath('//meta[@name="keywords"]/@content').extract_first()
        # Body: standard article template, then photo-story template, then a
        # generic #content fallback.
        article = response.xpath('//div[@class="article-content cf new_txt"]').xpath('string(.)').extract_first()
        if article is None:
            article = response.xpath('//div[@class="pic-content new_txt"]').xpath('string(.)').extract_first()
        if article is None:
            article = response.xpath('//*[@id="content"]').xpath('string(.)').extract_first()
        if article is not None:
            article = article.strip()
        item['title'] = title
        item['day'] = day
        item['_type'] = _type
        item['url'] = url
        item['keywords'] = keywords
        item['site'] = u'深圳新闻网'
        item['article'] = article
        return item
